 1. Spark Internals: Task Scheduling in the Job Execution Process
   
   Task scheduling is carried out by the TaskScheduler (the low-level scheduler).
   The DAGScheduler packages each Stage into a TaskSet and hands it to the TaskScheduler. The TaskScheduler wraps
the TaskSet in a TaskSetManager and adds it to the scheduling queue; the structure of a TaskSetManager is shown in
the figure below.
   A TaskSetManager monitors and manages the Tasks of a single Stage; the TaskScheduler schedules tasks with the
TaskSetManager as its unit of scheduling.
   After the TaskScheduler is initialized, it starts the SchedulerBackend (see the SparkContext source).
   The SchedulerBackend deals with the outside world: it accepts Executor registrations and maintains Executor
state. The SchedulerBackend manages the "resources" (Executors); once started, it periodically "asks" the
TaskScheduler whether there are tasks to run.
   When the SchedulerBackend "asks", the TaskScheduler selects a TaskSetManager from the scheduling queue according
to the configured scheduling policy and schedules it. The rough method-call flow is shown in the figure below.
   After the TaskSetManager is added to the rootPool scheduling pool, the SchedulerBackend's reviveOffers method is
called, which sends a ReviveOffers message to the driverEndpoint. On receiving ReviveOffers, the driverEndpoint
calls makeOffers, filters out the alive Executors (the Executors that registered back with the Driver when the
application started), and wraps each of them in a WorkerOffer object. With the compute resources (WorkerOffers)
prepared, the taskScheduler calls resourceOffers on them to assign Tasks to the Executors.
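   As an aside, the scheduling policy that rootPool uses to order TaskSetManagers can be switched between FIFO (the
default) and FAIR from application code. A minimal sketch, assuming a Spark 2.x application; the pool name
"highPriority" is an illustrative placeholder:

  import org.apache.spark.{SparkConf, SparkContext}

  object SchedulingModeSketch {
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf()
        .setAppName("scheduling-mode-demo")
        .setMaster("local[4]")
        // Choose the policy rootPool uses to order TaskSetManagers: FIFO (default) or FAIR
        .set("spark.scheduler.mode", "FAIR")
      // Optionally, "spark.scheduler.allocation.file" can point at a fairscheduler.xml that
      // defines pools with weights and minShare (omitted here so the sketch runs as-is).
      val sc = new SparkContext(conf)

      // Jobs submitted from this thread go into the named pool inside rootPool
      sc.setLocalProperty("spark.scheduler.pool", "highPriority") // pool name is illustrative
      sc.parallelize(1 to 100, 4).map(_ * 2).count()
      sc.stop()
    }
  }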
   1).submitTasks
  // Handle an incoming TaskSet
  override def submitTasks(taskSet: TaskSet) {
    // All of the Tasks contained in the TaskSet
    val tasks = taskSet.tasks
    logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
    this.synchronized {
      // Create a TaskSetManager for this TaskSet
      val manager = createTaskSetManager(taskSet, maxTaskFailures)
      // The Stage this TaskSet belongs to
      val stage = taskSet.stageId
      // Get (or create) the attempt -> TaskSetManager map for this Stage
      val stageTaskSets =
        taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])

      // Mark every TaskSetManager of earlier attempts of this Stage as a zombie,
      // since a new attempt is being registered
      stageTaskSets.foreach { case (_, ts) =>
        ts.isZombie = true
      }
      stageTaskSets(taskSet.stageAttemptId) = manager

      // Check for a conflicting TaskSet: taskSetsByStageIdAndAttempt must not contain a
      // different, still-active TaskSet for the same Stage
      val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
        ts.taskSet != taskSet && !ts.isZombie
      }
      if (conflictingTaskSet) {
        throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
          s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
      }

      // Add the newly created TaskSetManager to the scheduling pool via the schedulable builder
      schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

      // If the application is not running in local mode and no Task has been received yet
      if (!isLocal && !hasReceivedTask) {
        // Start a timer that periodically checks whether TaskSchedulerImpl is starving
        starvationTimer.scheduleAtFixedRate(new TimerTask() {
          // Periodically check the starvation state of TaskSchedulerImpl
          override def run() {
            if (!hasLaunchedTask) {
              logWarning("Initial job has not accepted any resources; " +
                "check your cluster UI to ensure that workers are registered " +
                "and have sufficient resources")
            } else {
              // Once TaskSchedulerImpl has launched a Task, cancel this timer
              this.cancel()
            }
          }
        }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
      }
      // Mark that TaskSchedulerImpl has received a Task
      hasReceivedTask = true
    }
    // Offer resources to the Tasks and launch them
    backend.reviveOffers()
  }
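   The zombie marking in submitTasks is easiest to see in isolation: when a Stage is resubmitted as a new attempt,
every TaskSetManager registered for an earlier attempt of that Stage is flagged as a zombie, so only the newest
attempt remains schedulable. A self-contained sketch of that bookkeeping, using simplified stand-in types rather
than Spark's real classes:

  import scala.collection.mutable.HashMap

  object ZombieMarkingSketch {
    // Simplified stand-in for a TaskSetManager (not Spark's class)
    final case class FakeManager(stageId: Int, attempt: Int, var isZombie: Boolean = false)

    // stageId -> (attempt -> manager), mirroring taskSetsByStageIdAndAttempt
    private val byStageAndAttempt = new HashMap[Int, HashMap[Int, FakeManager]]

    def submit(stageId: Int, attempt: Int): FakeManager = {
      val attempts = byStageAndAttempt.getOrElseUpdate(stageId, new HashMap[Int, FakeManager])
      // Every previously registered attempt of this stage becomes a zombie
      attempts.values.foreach(_.isZombie = true)
      val manager = FakeManager(stageId, attempt)
      attempts(attempt) = manager
      manager
    }

    def main(args: Array[String]): Unit = {
      val first = submit(stageId = 3, attempt = 0)
      val retry = submit(stageId = 3, attempt = 1) // e.g. the stage was resubmitted after a fetch failure
      println(s"attempt 0 zombie: ${first.isZombie}, attempt 1 zombie: ${retry.isZombie}")
      // prints: attempt 0 zombie: true, attempt 1 zombie: false
    }
  }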
   2).makeOffers
   private def makeOffers(executorId: String) {
     // Filter out executors under killing
     // Only proceed if the Executor is still alive
     if (executorIsAlive(executorId)) {
       // Look up the ExecutorData kept for this Executor
       val executorData = executorDataMap(executorId)
       // Wrap the Executor's host and free cores in a WorkerOffer
       val workOffers = IndexedSeq(
         new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))
       // Assign resources to Tasks and launch them
       launchTasks(scheduler.resourceOffers(workOffers))
     }
   }
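   A WorkerOffer is nothing more than a small value object describing one Executor's spare capacity. A simplified
sketch of what makeOffers hands to resourceOffers, assuming the three-field shape used in the Spark 2.x source
quoted above (later versions carry extra address and resource fields); the executor ids and hostnames are
illustrative:

  // Simplified stand-in for Spark's WorkerOffer case class
  case class WorkerOfferSketch(executorId: String, host: String, cores: Int)

  object MakeOffersSketch {
    def main(args: Array[String]): Unit = {
      // One offer per alive Executor, built from its host and currently free cores
      val offers = IndexedSeq(
        WorkerOfferSketch("exec-1", "node-a.example.com", cores = 4),
        WorkerOfferSketch("exec-2", "node-b.example.com", cores = 2)
      )
      offers.foreach(o => println(s"${o.executorId} on ${o.host} has ${o.cores} free cores"))
    }
  }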
   3).resourceOffers
   // Offers resources to Tasks
   def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
    // Mark each slave as alive and remember its hostname
    // Also track if new executor is added
    var newExecAvail = false
    // Walk through the WorkerOffer sequence
    for (o <- offers) {
      // Record each offer's host in the hostToExecutors map
      if (!hostToExecutors.contains(o.host)) {
        hostToExecutors(o.host) = new HashSet[String]()
      }
      // Update the various Host <-> Executor mappings
      if (!executorIdToRunningTaskIds.contains(o.executorId)) {
        hostToExecutors(o.host) += o.executorId
        // Post an ExecutorAdded event to the DAGScheduler's DAGSchedulerEventProcessLoop
        // to announce that a new Executor has been added
        executorAdded(o.executorId, o.host)
        executorIdToHost(o.executorId) = o.host
        executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
        // Remember that a new Executor became available
        newExecAvail = true
      }
      // Update the Host -> rack mapping
      for (rack <- getRackForHost(o.host)) {
        hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
      }
    }

    // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
    // this here to avoid a separate thread and added synchronization overhead, and also because
    // updating the blacklist is only relevant when task offers are being made.
    blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())
    
    // Remove offers whose node or executor is currently blacklisted
    val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
      offers.filter { offer =>
        !blacklistTracker.isNodeBlacklisted(offer.host) &&
          !blacklistTracker.isExecutorBlacklisted(offer.executorId)
      }
    }.getOrElse(offers)

    // Shuffle the offers randomly so that tasks are not always assigned to the same set of Workers
    val shuffledOffers = shuffleOffers(filteredOffers)
    // Build a list of tasks to assign to each worker.
    // For each WorkerOffer, create a TaskDescription buffer sized to the number of task slots the
    // offer provides (cores / CPUS_PER_TASK): each Task occupies CPUS_PER_TASK cores
    val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
    // Record the number of available CPU cores of each Worker
    val availableCpus = shuffledOffers.map(o => o.cores).toArray
    // Sort all TaskSetManagers in rootPool according to the scheduling algorithm
    val sortedTaskSets = rootPool.getSortedTaskSetQueue
    // 遍历所有的TaskSetManager，如果有新的Executor添加就告诉它们，它们会重新计算支持的本地性级别。
	/**
      * 遍历TaskSetManager，
      * 在单个TaskSetManager中，按照最大本地性的原则（即从高本地性级别到低本地性级别）
      * 调用resourceOfferSingleTaskSet()方法，给单个TaskSet中的Task提供资源
      */
	for (taskSet <- sortedTaskSets) { 
      logDebug("parentName: %s, name: %s, runningTasks: %s".format(
        taskSet.parent.name, taskSet.name, taskSet.runningTasks))
      if (newExecAvail) {
        taskSet.executorAdded()
      }
    }

    // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
    // of locality levels so that it gets a chance to launch local tasks on all of them.
    // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
    // Walk through the TaskSetManagers in scheduling order; within each one, offer resources to
    // its Tasks from the highest locality level downwards via resourceOfferSingleTaskSet()
    for (taskSet <- sortedTaskSets) {
      val availableSlots = availableCpus.map(c => c / CPUS_PER_TASK).sum
      // Skip the barrier taskSet if the available slots are less than the number of pending tasks.
      if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
        // Skip the launch process.
        // TODO SPARK-24819 If the job requires more slots than available (both busy and free
        // slots), fail the job on submit.
        logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
          s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +
          s"number of available slots is $availableSlots.")
      } else {
        var launchedAnyTask = false
        // Record all the executor IDs assigned barrier tasks on.
        val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
        // For this single TaskSetManager, walk through the locality levels it supports and offer
        // resources to its Tasks, starting from the highest locality level
        for (currentMaxLocality <- taskSet.myLocalityLevels) {
          var launchedTaskAtCurrentMaxLocality = false
          do {
            // resourceOfferSingleTaskSet() assigns resources to this TaskSetManager at the current
            // locality level; every Task that is given resources has its TaskDescription appended
            // to the tasks array, and the return value indicates whether any Task got resources
            launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
              currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
            launchedAnyTask |= launchedTaskAtCurrentMaxLocality
          } while (launchedTaskAtCurrentMaxLocality)
        }
        // If no Task in this TaskSet obtained resources at any of its allowed locality levels
        if (!launchedAnyTask) {
          taskSet.getCompletelyBlacklistedTaskIfAny(hostToExecutors).foreach { taskIndex =>
              // If the taskSet is unschedulable we try to find an existing idle blacklisted
              // executor. If we cannot find one, we abort immediately. Else we kill the idle
              // executor and kick off an abortTimer which if it doesn't schedule a task within the
              // the timeout will abort the taskSet if we were unable to schedule any task from the
              // taskSet.
              // Note 1: We keep track of schedulability on a per taskSet basis rather than on a per
              // task basis.
              // Note 2: The taskSet can still be aborted when there are more than one idle
              // blacklisted executors and dynamic allocation is on. This can happen when a killed
              // idle executor isn't replaced in time by ExecutorAllocationManager as it relies on
              // pending tasks and doesn't kill executors on idle timeouts, resulting in the abort
              // timer to expire and abort the taskSet.
              executorIdToRunningTaskIds.find(x => !isExecutorBusy(x._1)) match {
                case Some ((executorId, _)) =>
                  if (!unschedulableTaskSetToExpiryTime.contains(taskSet)) {
                    blacklistTrackerOpt.foreach(blt => blt.killBlacklistedIdleExecutor(executorId))

                    val timeout = conf.get(config.UNSCHEDULABLE_TASKSET_TIMEOUT) * 1000
                    unschedulableTaskSetToExpiryTime(taskSet) = clock.getTimeMillis() + timeout
                    logInfo(s"Waiting for $timeout ms for completely "
                      + s"blacklisted task to be schedulable again before aborting $taskSet.")
                    abortTimer.schedule(
                      createUnschedulableTaskSetAbortTimer(taskSet, taskIndex), timeout)
                  }
                case None => // Abort Immediately
                  logInfo("Cannot schedule any task because of complete blacklisting. No idle" +
                    s" executors can be found to kill. Aborting $taskSet." )
                  taskSet.abortSinceCompletelyBlacklisted(taskIndex)
              }
          }
        } else {
          // We want to defer killing any taskSets as long as we have a non blacklisted executor
          // which can be used to schedule a task from any active taskSets. This ensures that the
          // job can make progress.
          // Note: It is theoretically possible that a taskSet never gets scheduled on a
          // non-blacklisted executor and the abort timer doesn't kick in because of a constant
          // submission of new TaskSets. See the PR for more details.
          if (unschedulableTaskSetToExpiryTime.nonEmpty) {
            logInfo("Clearing the expiry times for all unschedulable taskSets as a task was " +
              "recently scheduled.")
            unschedulableTaskSetToExpiryTime.clear()
          }
        }

        if (launchedAnyTask && taskSet.isBarrier) {
          // Check whether the barrier tasks are partially launched.
          // TODO SPARK-24818 handle the assert failure case (that can happen when some locality
          // requirements are not fulfilled, and we should revert the launched tasks).
          require(addressesWithDescs.size == taskSet.numTasks,
            s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
              s"because only ${addressesWithDescs.size} out of a total number of " +
              s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +
              "been blacklisted or cannot fulfill task locality requirements.")

          // materialize the barrier coordinator.
          maybeInitBarrierCoordinator()

          // Update the taskInfos into all the barrier task properties.
          val addressesStr = addressesWithDescs
            // Addresses ordered by partitionId
            .sortBy(_._2.partitionId)
            .map(_._1)
            .mkString(",")
          addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))

          logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +
            s"stage ${taskSet.stageId}.")
        }
      }
    }

    // TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
    // launched within a configured time.
    if (tasks.size > 0) {
      hasLaunchedTask = true
    }
    // Return the TaskDescriptions that have been assigned resources
    return tasks
  }
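   The availableSlots computation above is plain integer arithmetic: each offer contributes cores / CPUS_PER_TASK
slots, where CPUS_PER_TASK comes from spark.task.cpus (1 by default). A small sketch with illustrative numbers:

  object SlotArithmeticSketch {
    def main(args: Array[String]): Unit = {
      // Free cores reported by the shuffled WorkerOffers (illustrative numbers)
      val availableCpus = Array(4, 4, 3)
      // spark.task.cpus: cores required per task (1 by default; 2 here for illustration)
      val cpusPerTask = 2
      // Mirrors availableCpus.map(c => c / CPUS_PER_TASK).sum in resourceOffers
      val availableSlots = availableCpus.map(c => c / cpusPerTask).sum
      println(s"available slots: $availableSlots") // 2 + 2 + 1 = 5
      // A barrier TaskSet with more than 5 pending tasks would be skipped in this round
    }
  }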
  
   The locality levels a TaskSetManager supports (used as taskSet.myLocalityLevels above) are computed by
TaskSetManager.computeValidLocalityLevels:
  private def computeValidLocalityLevels(): Array[TaskLocality.TaskLocality] = {
    // Import the locality level values
    import TaskLocality.{PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY}
    // Buffer holding the allowed locality levels
    val levels = new ArrayBuffer[TaskLocality.TaskLocality]
    if (!pendingTasksForExecutor.isEmpty && // there are pending Tasks targeting specific Executors
      getLocalityWait(PROCESS_LOCAL) != 0 && // the PROCESS_LOCAL wait time is non-zero
        // at least one of those Executors is still alive
        pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) {
      levels += PROCESS_LOCAL // PROCESS_LOCAL is an allowed locality level
    }
    if (!pendingTasksForHost.isEmpty && // there are pending Tasks targeting specific Hosts
      getLocalityWait(NODE_LOCAL) != 0 && // the NODE_LOCAL wait time is non-zero
        // at least one of those Hosts has an alive Executor
        pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) {
      levels += NODE_LOCAL // NODE_LOCAL is an allowed locality level
    }
    if (!pendingTasksWithNoPrefs.isEmpty) { // there are pending Tasks with no locality preference
      levels += NO_PREF // NO_PREF is an allowed locality level
    }
    if (!pendingTasksForRack.isEmpty && // there are pending Tasks targeting specific racks
      getLocalityWait(RACK_LOCAL) != 0 && // the RACK_LOCAL wait time is non-zero
        // at least one of those racks has an alive Host
        pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) {
      levels += RACK_LOCAL // RACK_LOCAL is an allowed locality level
    }
    levels += ANY // ANY is always allowed
    logDebug("Valid locality levels for " + taskSet + ": " + levels.mkString(", "))
    levels.toArray // return all allowed locality levels
  }
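   The getLocalityWait(...) checks above are driven by the spark.locality.wait settings: when the wait configured
for a level is 0, that level is dropped from the valid locality levels. A minimal sketch of the relevant
configuration keys; the values shown are illustrative, not recommendations:

  import org.apache.spark.SparkConf

  object LocalityWaitSketch {
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf()
        .set("spark.locality.wait", "3s")          // base wait shared by the per-level settings below
        .set("spark.locality.wait.process", "3s")  // wait before giving up on PROCESS_LOCAL
        .set("spark.locality.wait.node", "3s")     // wait before giving up on NODE_LOCAL
        .set("spark.locality.wait.rack", "0")      // 0 removes RACK_LOCAL from the valid levels
      println(conf.get("spark.locality.wait.rack")) // "0"
    }
  }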
   4).launchTasks
   private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      for (task <- tasks.flatten) {
        // Serialize the TaskDescription
        val serializedTask = TaskDescription.encode(task)
        if (serializedTask.limit() >= maxRpcMessageSize) { // the serialized size exceeds the RPC message limit
          Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
                "spark.rpc.message.maxSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
              // Give up scheduling this TaskSetManager
              taskSetMgr.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        }
        else { // the serialized TaskDescription fits within maxRpcMessageSize
          val executorData = executorDataMap(task.executorId)
          // Deduct the Task's cores from the Executor's free cores
          executorData.freeCores -= scheduler.CPUS_PER_TASK

          logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
            s"${executorData.executorHost}.")
          // Send a LaunchTask message to the CoarseGrainedExecutorBackend.
          // CoarseGrainedExecutorBackend will run the Task when it receives LaunchTask.
          executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
        }
      }
    }
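   The abort message in launchTasks points at two remedies when a serialized TaskDescription exceeds the RPC
limit: raise spark.rpc.message.maxSize, or stop embedding large values in the task closure and ship them as
broadcast variables instead. A hedged sketch of both, assuming a local-mode application; the limit value and the
lookup-table payload are illustrative:

  import org.apache.spark.{SparkConf, SparkContext}

  object LargeTaskSketch {
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf()
        .setAppName("large-task-demo")
        .setMaster("local[2]")
        // One escape hatch: raise the RPC limit (value in MiB; 256 is illustrative)...
        .set("spark.rpc.message.maxSize", "256")
      val sc = new SparkContext(conf)

      // ...but the message recommends broadcast variables: ship a large lookup table once per
      // Executor instead of serializing it into every TaskDescription via the closure.
      val bigLookup = (1 to 100000).map(i => i -> (i * 2)).toMap // illustrative payload
      val lookupBc = sc.broadcast(bigLookup)
      val result = sc.parallelize(1 to 10).map(i => lookupBc.value.getOrElse(i, 0)).sum()
      println(result)
      sc.stop()
    }
  }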
   
   